import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
# Load the student-outcome dataset.
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a config value.
csv_path = "C:/Users/abhishek/OneDrive/Desktop/dt/dataset.csv"
data = pd.read_csv(csv_path)
# Preview the first five rows.
data.head()
| Marital status | Application mode | Application order | Course | Daytime/evening attendance | Previous qualification | Nacionality | Mother's qualification | Father's qualification | Mother's occupation | ... | Curricular units 2nd sem (credited) | Curricular units 2nd sem (enrolled) | Curricular units 2nd sem (evaluations) | Curricular units 2nd sem (approved) | Curricular units 2nd sem (grade) | Curricular units 2nd sem (without evaluations) | Unemployment rate | Inflation rate | GDP | Target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 6 | 1 | 11 | 1 | 1 | 1 | 1 | 3 | 4 | ... | 0 | 6 | 6 | 6 | 13.666667 | 0 | 13.9 | -0.3 | 0.79 | Enrolled |
| 1 | 1 | 8 | 2 | 15 | 1 | 1 | 1 | 23 | 27 | 6 | ... | 0 | 6 | 10 | 5 | 12.400000 | 0 | 9.4 | -0.8 | -3.12 | Enrolled |
| 2 | 2 | 12 | 1 | 3 | 0 | 1 | 1 | 22 | 28 | 10 | ... | 0 | 6 | 6 | 6 | 13.000000 | 0 | 13.9 | -0.3 | 0.79 | Enrolled |
| 3 | 2 | 12 | 1 | 17 | 0 | 12 | 1 | 22 | 27 | 10 | ... | 0 | 5 | 17 | 5 | 11.500000 | 5 | 16.2 | 0.3 | -0.92 | Enrolled |
| 4 | 1 | 1 | 1 | 12 | 1 | 1 | 1 | 13 | 28 | 8 | ... | 0 | 8 | 8 | 8 | 14.345000 | 0 | 15.5 | 2.8 | -4.06 | Enrolled |
5 rows × 35 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4424 entries, 0 to 4423 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Marital status 4424 non-null int64 1 Application mode 4424 non-null int64 2 Application order 4424 non-null int64 3 Course 4424 non-null int64 4 Daytime/evening attendance 4424 non-null int64 5 Previous qualification 4424 non-null int64 6 Nacionality 4424 non-null int64 7 Mother's qualification 4424 non-null int64 8 Father's qualification 4424 non-null int64 9 Mother's occupation 4424 non-null int64 10 Father's occupation 4424 non-null int64 11 Displaced 4424 non-null int64 12 Educational special needs 4424 non-null int64 13 Debtor 4424 non-null int64 14 Tuition fees up to date 4424 non-null int64 15 Gender 4424 non-null int64 16 Scholarship holder 4424 non-null int64 17 Age at enrollment 4424 non-null int64 18 International 4424 non-null int64 19 Curricular units 1st sem (credited) 4424 non-null int64 20 Curricular units 1st sem (enrolled) 4424 non-null int64 21 Curricular units 1st sem (evaluations) 4424 non-null int64 22 Curricular units 1st sem (approved) 4424 non-null int64 23 Curricular units 1st sem (grade) 4424 non-null float64 24 Curricular units 1st sem (without evaluations) 4424 non-null int64 25 Curricular units 2nd sem (credited) 4424 non-null int64 26 Curricular units 2nd sem (enrolled) 4424 non-null int64 27 Curricular units 2nd sem (evaluations) 4424 non-null int64 28 Curricular units 2nd sem (approved) 4424 non-null int64 29 Curricular units 2nd sem (grade) 4424 non-null float64 30 Curricular units 2nd sem (without evaluations) 4424 non-null int64 31 Unemployment rate 4424 non-null float64 32 Inflation rate 4424 non-null float64 33 GDP 4424 non-null float64 34 Target 4424 non-null object dtypes: float64(5), int64(29), object(1) memory usage: 1.2+ MB
print(data["Target"].unique())
['Enrolled' 'Graduate' 'Dropout']
# Encode the categorical outcome as integers:
# Dropout -> 0, Enrolled -> 1, Graduate -> 2.
target_codes = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
data['Target'] = data['Target'].map(target_codes)
# Confirm the encoding took effect.
print(data["Target"].unique())
[1 2 0]
data.corr()['Target']
Marital status 0.074310 Application mode 0.110086 Application order -0.027393 Course 0.009877 Daytime/evening attendance -0.066439 Previous qualification 0.068021 Nacionality -0.009248 Mother's qualification 0.071100 Father's qualification 0.033291 Mother's occupation -0.077593 Father's occupation -0.094916 Displaced -0.070649 Educational special needs -0.003751 Debtor 0.154802 Tuition fees up to date -0.342121 Gender 0.118454 Scholarship holder -0.114517 Age at enrollment 0.201806 International -0.015893 Curricular units 1st sem (credited) 0.002464 Curricular units 1st sem (enrolled) -0.052020 Curricular units 1st sem (evaluations) -0.125278 Curricular units 1st sem (approved) -0.290243 Curricular units 1st sem (grade) -0.349652 Curricular units 1st sem (without evaluations) 0.021565 Curricular units 2nd sem (credited) 0.002427 Curricular units 2nd sem (enrolled) -0.060670 Curricular units 2nd sem (evaluations) -0.194412 Curricular units 2nd sem (approved) -0.351135 Curricular units 2nd sem (grade) -0.429214 Curricular units 2nd sem (without evaluations) 0.040991 Unemployment rate 0.037279 Inflation rate 0.021798 GDP -0.037052 Target 1.000000 Name: Target, dtype: float64
# Correlation heatmap across all numeric features.
corr_matrix = data.corr()
plt.figure(figsize=(30, 30))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
# 10 features whose correlation with Target is closest to zero (weakest signal).
correlations = data.corr()['Target']
top_10_features = correlations.abs().nsmallest(10).index
top_10_corr_values = correlations[top_10_features]
# Iterate label/value pairs instead of positional Series indexing:
# integer __getitem__ on a label-indexed Series is deprecated in pandas 2.x.
for feature, value in top_10_corr_values.items():
    print(feature, '-', value)
Curricular units 2nd sem (credited) - 0.0024272668160775935 Curricular units 1st sem (credited) - 0.0024639396702293673 Educational special needs - -0.0037511261997920774 Nacionality - -0.009247826022182699 Course - 0.009877475210117816 International - -0.015893341271952886 Curricular units 1st sem (without evaluations) - 0.021564930610866943 Inflation rate - 0.021797568994223828 Application order - -0.027392621567232125 Father's qualification - 0.0332911238103344
# Bar plot of the 10 weakest-correlated features.
plt.figure(figsize=(10, 11))
plt.bar(top_10_features, top_10_corr_values)
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
# Fixed the duplicated word in the original title ("Lowest correlation Correlation").
plt.title('Top 10 Features with Lowest Correlation to Target')
plt.xticks(rotation=45)
plt.show()
# Drop columns judged insignificant from the correlation analysis.
# DataFrame.drop already returns a new frame, so `data` is left intact.
insignificant_cols = [
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    'Educational special needs',
    'International',
    'Curricular units 1st sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
]
new_data = data.drop(columns=insignificant_cols)
# Verify the reduced schema (27 columns remain per the output below).
new_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4424 entries, 0 to 4423 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Marital status 4424 non-null int64 1 Application mode 4424 non-null int64 2 Application order 4424 non-null int64 3 Course 4424 non-null int64 4 Daytime/evening attendance 4424 non-null int64 5 Previous qualification 4424 non-null int64 6 Mother's occupation 4424 non-null int64 7 Father's occupation 4424 non-null int64 8 Displaced 4424 non-null int64 9 Debtor 4424 non-null int64 10 Tuition fees up to date 4424 non-null int64 11 Gender 4424 non-null int64 12 Scholarship holder 4424 non-null int64 13 Age at enrollment 4424 non-null int64 14 Curricular units 1st sem (credited) 4424 non-null int64 15 Curricular units 1st sem (enrolled) 4424 non-null int64 16 Curricular units 1st sem (evaluations) 4424 non-null int64 17 Curricular units 1st sem (approved) 4424 non-null int64 18 Curricular units 1st sem (grade) 4424 non-null float64 19 Curricular units 2nd sem (credited) 4424 non-null int64 20 Curricular units 2nd sem (enrolled) 4424 non-null int64 21 Curricular units 2nd sem (evaluations) 4424 non-null int64 22 Curricular units 2nd sem (approved) 4424 non-null int64 23 Curricular units 2nd sem (grade) 4424 non-null float64 24 Curricular units 2nd sem (without evaluations) 4424 non-null int64 25 GDP 4424 non-null float64 26 Target 4424 non-null int64 dtypes: float64(3), int64(24) memory usage: 933.3 KB
new_data['Target'].value_counts()
Target 1 2209 2 1421 0 794 Name: count, dtype: int64
# Donut chart of the Target class distribution.
# Derive the display labels from the encoded values instead of hard-coding a
# label list: the original `labels=['Enrolled', 'Graduated', 'Dropout']`
# silently depended on value_counts() ordering, mislabelled 'Graduate' as
# 'Graduated', and triggered the FutureWarning shown below.
code_to_label = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
counts = new_data['Target'].value_counts()
x = counts.index
y = counts.values
df = pd.DataFrame({
    'Target': [code_to_label[c] for c in x],
    'Count_T': y
})
fig = px.pie(df,
             names='Target',
             values='Count_T',
             title='How many dropouts, enrolled & graduates are there in Target column')
fig.update_traces(hole=0.4, textinfo='value+label', pull=[0, 0.2, 0.1])
fig.show()
C:\Users\abhishek\anaconda3_1\Lib\site-packages\numpy\core\numeric.py:2468: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
# 10 features most strongly correlated with Target.
# Drop Target itself first: its self-correlation of 1.0 occupied a slot in the
# original "top 10" (see the output below) and crowded out a real feature.
correlations = data.corr()['Target'].drop('Target')
top_10_features = correlations.abs().nlargest(10).index
top_10_corr_values = correlations[top_10_features]
# .items() avoids deprecated positional indexing on a label-indexed Series.
for feature, value in top_10_corr_values.items():
    print(feature, '-', value)
Target - 1.0 Curricular units 2nd sem (grade) - -0.4292142441814479 Curricular units 2nd sem (approved) - -0.35113529228045165 Curricular units 1st sem (grade) - -0.34965164276834676 Tuition fees up to date - -0.342120547448968 Curricular units 1st sem (approved) - -0.29024327759856267 Age at enrollment - 0.20180618945687442 Curricular units 2nd sem (evaluations) - -0.1944116159981643 Debtor - 0.15480160305546845 Curricular units 1st sem (evaluations) - -0.1252778058589179
# Visualise the strongest-correlated features as a bar chart.
plt.figure(figsize=(10, 11))
plt.bar(top_10_features, top_10_corr_values)
plt.title('Top 10 Features with Highest Correlation to Target')
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
plt.xticks(rotation=45)
plt.show()
# Age distribution, then age broken out by outcome class.
px.histogram(new_data['Age at enrollment'], x='Age at enrollment',
             color_discrete_sequence=['lightblue'])
sns.boxplot(x='Target', y='Age at enrollment', data=new_data)
plt.title('Relationship between Age and Target')
plt.xlabel('Target')
plt.ylabel('Age')
plt.show()
# packages needed for models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
# Split features/labels and hold out 20% of rows for testing.
X = new_data.drop('Target', axis=1)
y = new_data['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Plain ndarray view of the test set (kept for any caller predicting on raw arrays).
X_test_array = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

# Instantiate the candidate classifiers.
dtree = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=2)
# max_iter raised from the default 100: lbfgs previously stopped early with a
# ConvergenceWarning on this unscaled data (see the warning output below).
lr = LogisticRegression(random_state=42, max_iter=1000)
knn = KNeighborsClassifier(n_neighbors=3)
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0)
xbc = XGBClassifier(tree_method='hist')
# NOTE(review): this rebinding shadows the imported `svm` module. Later cells
# depend on the name, so it is kept, but a distinct name (e.g. `svc`) would be safer.
svm = svm.SVC(kernel='linear', probability=True)
# Fit every candidate model on the same training split.
for clf in (dtree, rfc, lr, knn, abc, xbc, svm):
    clf.fit(X_train, y_train)
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
SVC(kernel='linear', probability=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(kernel='linear', probability=True)
# Decision-tree accuracy on the held-out test set.
y_pred = dtree.predict(X_test)
dtree_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(dtree_acc * 100, 2), "%")
Accuracy : 66.55 %
# Random-forest accuracy on the held-out test set.
y_pred = rfc.predict(X_test)
rfc_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(rfc_acc * 100, 2), "%")
Accuracy : 77.4 %
# Logistic-regression accuracy on the held-out test set.
y_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(lr_acc * 100, 2), "%")
Accuracy : 77.06 %
# k-nearest-neighbours accuracy on the held-out test set.
# Predict on the DataFrame itself rather than the bare ndarray view: the model
# was fitted with feature names, and passing `X_test_array` raised the
# "X does not have valid feature names" UserWarning shown below.
y_pred = knn.predict(X_test)
print("Accuracy :", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
Accuracy : 69.6 %
# AdaBoost accuracy on the held-out test set.
y_pred = abc.predict(X_test)
abc_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(abc_acc * 100, 2), "%")
Accuracy : 74.92 %
# XGBoost accuracy on the held-out test set.
y_pred = xbc.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(xgb_acc * 100, 2), "%")
Accuracy : 76.84 %
# SVM accuracy on the held-out test set.
y_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, y_pred)
print("Accuracy :", round(svm_acc * 100, 2), "%")
Accuracy : 76.95 %
# Soft-voting ensemble: average the predicted class probabilities of the four
# strongest individual models.
soft_members = [('rfc', rfc), ('lr', lr), ('abc', abc), ('xbc', xbc)]
ens1 = VotingClassifier(estimators=soft_members, voting='soft')
ens1.fit(X_train, y_train)
y_pred = ens1.predict(X_test)
print("Accuracy :", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Accuracy : 79.32 %
# Hard-voting ensemble: majority vote over the same four models' class labels.
hard_members = [('rfc', rfc), ('lr', lr), ('abc', abc), ('xbc', xbc)]
ens2 = VotingClassifier(estimators=hard_members, voting='hard')
ens2.fit(X_train, y_train)
y_pred = ens2.predict(X_test)
print("Accuracy :", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
C:\Users\abhishek\anaconda3_1\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Accuracy : 77.4 %